predicted.score threshold 0.5The aim is to explore the annotations and label transfers for all of the samples in SCPCP000006.
In order to explore the label transfer results, we look into some marker genes, table and percentages of cells in each annotation groups (from label transfers).
We also looked into the predicted.score for each of the
compartments and explored the potential effects of score thresholds. In
this notebook, we fixed the predicted.score threshold to
0.5.
library(Seurat)
library(tidyverse)
library(tidyr)
library(patchwork)
library(DT)# The base path for the OpenScPCA repository, found by its (hidden) .git directory
repository_base <- rprojroot::find_root(rprojroot::is_git_root)
# The current data directory, found within the repository base directory
data_dir <- file.path(repository_base, "data", "current", "SCPCP000006")
# The path to this module
module_base <- file.path(repository_base, "analyses", "cell-type-wilms-tumor-06")
result_dir <- file.path(module_base, "results")In this notebook, we are working with all of the samples in SCPCP000006.
The sample metadata can be found in sample_metadata_file
in the data folder.
We extracted from the pre-processed and labeled Seurat
object (that is the output of
02b_label-transfer_fetal_kidney_reference_Stewart.Rmd saved
in the results directory) the following information per
cell:
the predicted compartment in
fetal_kidney_predicted.compartment
the predicted organ in
fetal_full_predicted.organ
the sample identifier in sample_id
the counts of the endothelial marker
VWF = "ENSG00000110799"
the counts of the immune marker
PTPRC = "ENSG00000081237"
sample_metadata_file <- file.path(repository_base, "data", "current", "SCPCP000006", "single_cell_metadata.tsv")
metadata <- read_tsv(sample_metadata_file) |>
dplyr::filter(seq_unit != "spot")
# Create a data frames of all annotations
cell_type_df <- metadata$scpca_sample_id |>
purrr::map(
# For each sample_id, do the following:
\(sample_id) {
input_file <- file.path(
result_dir,
sample_id,
glue::glue("02b-fetal_kidney_label-transfer_{sample_id}.Rds")
)
# The file may not be present if this is being run in CI, which is ok.
# If we are not running in CI and the file doesn't exist, we should error out
# We should error out if the file does not exist and we are NOT testing
if (!file.exists(input_file)) {
if (params$testing) {
return(NULL)
} else {
stop("Input RDS file does not exist.")
}
}
# Read in the Seurat object
srat <- readRDS(input_file)
# Create and return a data frame from the Seurat object with relevant annotations
# this data frame will have six columns: barcode, compartment, organ, compartment_score, VWF and PTPRC
data.frame(
compartment = srat$fetal_kidney_predicted.compartment,
compartment_score = srat$fetal_kidney_predicted.compartment.score,
organ = srat$fetal_full_predicted.organ,
PTPRC = FetchData(object = srat, vars = "ENSG00000081237", layer = "counts"),
VWF = FetchData(object = srat, vars = "ENSG00000110799", layer = "counts"),
umap = srat@reductions$umap@cell.embeddings
) |>
tibble::rownames_to_column("barcode") |>
dplyr::mutate(sample_id = sample_id)
}
) |>
# now combine all dataframes to make one big one
dplyr::bind_rows() |>
# Combine with metadata
dplyr::left_join(metadata, by = c("sample_id" = "scpca_sample_id")) |>
# Create column for whether the compartment score passes threshold
dplyr::mutate(pass_mapping_QC = compartment_score > params$predicted.score_thr)
# we create a data frames of annotation of cells that pass the `predicted.score` threshold
cell_type_df_pass <- cell_type_df |>
dplyr::filter(pass_mapping_QC)
# we create a data frames of annotation of cells that don't pass the `predicted.score` threshold
cell_type_df_nopass <- cell_type_df |>
dplyr::filter(!pass_mapping_QC)The report will be saved in the notebook directory.
do_Feature_mean shows a heatmap of mean expression of a
feature grouped by a metadata.
df is the name of the table containing metadata and
feature expression (counts) per cellsgroup.by is the name of the metadata to group the
cellsfeature is the name of the gene to average the
expressiondo_Feature_mean <- function(df, group.by, feature) {
df <- df %>%
group_by(sample_id, !!sym(group.by)) %>%
summarise(m = mean(!!sym(feature)))
p <- ggplot(df, aes(x = sample_id, y = !!sym(group.by), fill = m)) +
geom_tile() +
scale_fill_viridis_c() +
theme_bw() +
theme(text = element_text(size = 20)) +
theme(axis.text.x = element_text(angle = 90, hjust = 0.5, vjust = 0.5), title = element_text(size = rel(0.75))) +
guides(fill = guide_colourbar(title = paste0(feature)))
return(p)
}do_Feature_boxplot shows boxplot of expression of a
feature grouped by a metadata.
df is the name of the table containing metadata and
feature expression (counts) per cellsfeature is the name of the gene to average the
expressiongroup.by is the name of the metadata to group the
cellssplit.by is the name of the metadata to split the
plotsdo_Feature_boxplot <- function(df, group.by, feature, split.by) {
df <- df %>%
mutate(!!sym(group.by), group = factor(!!sym(group.by), levels = c("fetal_nephron", "stroma", "immune", "endothelium")))
p <- ggplot(
df,
aes(x = group, y = !!sym(feature), fill = group)
) +
geom_boxplot(size = 0.5, size.outlier = 0.25) +
facet_wrap(vars(!!sym(split.by)), scale = "free_y", ncol = 4) +
theme_bw() +
theme(
axis.text.x = element_text(size = 12, angle = 0, hjust = 0.5, vjust = 0),
legend.position = "none"
)
return(p)
}do_Feature_densityplot shows boxplot of expression of a
feature grouped by a metadata.
df is the name of the table containing metadata and
feature expression (counts) per cellsgroup.by is the name of the metadata to group the
cellsfeature is the name of the gene to average the
expressiondo_Feature_densityplot <- function(df, group.by, feature) {
df <- df %>%
mutate(!!sym(group.by), group = factor(!!sym(group.by), levels = c("fetal_nephron", "stroma", "immune", "endothelium")))
p <- ggplot(
df,
aes(x = !!sym(feature), fill = group)
) +
geom_density() +
facet_wrap(vars(!!sym(group.by)), scale = "free_y", ncol = 4) +
theme_bw()
return(p)
}The predicted organ comes from label transfer from the human fetal
atlas. This is the reference developed by Cao et
al. provided by Azimuth for label transfer. The
reference contain cells from 15 organs including kidney from fetal
samples. The label transfer have been performed in the notebook
02a_fetal_full_reference_Cao_{sample_id}.Rmd
This notebook looks at the fetal_full_predicted.organ
annotation from this reference.
Here, we are summarizing per sample:
the percentage of cells labeled as kidney, after label transfer from the fetal full reference Cao et al.,
the percentage of cells not labeled as kidney, after label transfer from the fetal full reference Cao et al.,
the total number of cells per sample.
kidney_count_df <- cell_type_df |>
# make a new variable that tells us if it's kidney or not
dplyr::mutate(organ_type = ifelse(organ == "Kidney", "kidney", "not_kidney")) |>
# count how many kidney or not
dplyr::count(sample_id, organ_type) |>
# make the data frame wide and use a value of 0 if a count is missing
tidyr::pivot_wider(names_from = organ_type, values_from = n, values_fill = 0) |>
# add a column for total, and convert other counts to percentages
dplyr::mutate(
total = kidney + not_kidney,
kidney = round(kidney / total, 4) * 100,
not_kidney = round(not_kidney / total, 4) * 100
) |>
# arrange in descending order of percentage of kidney cells
dplyr::arrange(desc(kidney))
DT::datatable(kidney_count_df,
caption = "percentage of cells mapping to the predicted organ kidney",
extensions = "Buttons",
options = list(
dom = "Bfrtip",
buttons = c("csv", "excel")
)
)The predicted compartment is the result of the label transfer from
the human fetal kidney atlas: Stewart et
al. created a human fetal
kidney atlas. This reference contains only fetal kidney cells and
has been precisely annotated by kidney experts. The label transfer have
been performed in the notebook
02b_fetal_kidney_reference_Stewart_{sample_id}.Rmd
The fetal kidney reference (Stewart et al.) provides two levels of annotations:
fetal_kidney_predicted.compartment is one of the 4
main compartments composing a fetal kidney. We expect immune and
endothelial cells to be healthy (non-cancerous) cells identified easily
with high confidence. We expect stroma and fetal nephron compartment to
contain both normal and malignant cells.
immune cells
stroma cells
fetal_nephron cells
endothelial cells
For each compartment, we summarized the percentage of cells that do match kidney annotation or not. Please note that this table is not sample-specific but contains all samples pooled into one.
compartment_count_df <- cell_type_df |>
# make a new variable that tells us if it's kidney or not
dplyr::mutate(organ_type = ifelse(organ == "Kidney", "kidney", "not_kidney")) |>
# count how many kidney or not
dplyr::count(compartment, organ_type) |>
# make the data frame wide and use a value of 0 if a count is missing
tidyr::pivot_wider(names_from = organ_type, values_from = n, values_fill = 0) |>
# add a column for total, and convert other counts to percentages
dplyr::mutate(
total = kidney + not_kidney,
kidney = round(kidney / total, 4) * 100,
not_kidney = round(not_kidney / total, 4) * 100
) |>
# arrange in descending order of percentage of kidney cells
dplyr::arrange(desc(kidney))
DT::datatable(compartment_count_df,
caption = "percentage of cells mapping to the predicted organ kidney",
extensions = "Buttons",
options = list(
dom = "Bfrtip",
buttons = c("csv", "excel")
)
)What is the predicted organ of cells that are not labeled as kidney cell? Please note that this table is not sample-specific but contains all samples pooled into one.
compartment_df <- cell_type_df |>
# count how many per compartment
dplyr::count(organ, compartment) |>
# make the table wide and use a value of 0 if a count is missing
tidyr::pivot_wider(names_from = compartment, values_from = n, values_fill = 0) |>
# order the columns how we want to show them
# use any_of in case columns are missing, which will be the case for test data
dplyr::select(dplyr::any_of(c("organ", "fetal_nephron", "stroma", "endothelium", "immune")))
DT::datatable(compartment_df,
caption = "counts of cells in each compartment",
extensions = "Buttons",
options = list(
dom = "Bfrtip",
buttons = c("csv", "excel")
)
)We also checked the number of cell in each compartment per sample, to assess the presence/absence of non-cancer cells (endothelia and immune) that could help the inference of copy number alterations.
compartment_df <- cell_type_df |>
# count how many per compartment
dplyr::count(sample_id, compartment) |>
# make the table wide and use a value of 0 if a count is missing
tidyr::pivot_wider(names_from = compartment, values_from = n, values_fill = 0) |>
# order the columns how we want to show them
# use any_of in case columns are missing, which will be the case for test data
dplyr::select(dplyr::any_of(c("organ", "fetal_nephron", "stroma", "endothelium", "immune")))
DT::datatable(compartment_df,
caption = "counts of cells in each compartment",
extensions = "Buttons",
options = list(
dom = "Bfrtip",
buttons = c("csv", "excel")
)
)The vertical line drawn correspods to the threshold explored in the notebook.
p <- do_Feature_densityplot(
df = cell_type_df,
feature = "compartment_score",
group.by = "compartment"
)
p + geom_vline(xintercept = params$predicted.score_thr)Some cells have an excellent predicted.score (> 0.5)
while other performed worse. Let’s have a look at the compartments per
patient.
p <- do_Feature_boxplot(
df = cell_type_df,
feature = "compartment_score",
group.by = "compartment",
split.by = "sample_id"
)
p + geom_hline(yintercept = params$predicted.score_thr) + coord_cartesian(ylim = c(0.25, 1))Regarding the endothelial cells, some patient have very high
predicted.score while others perform poorly. Let’s have a
look if this can be linked to some metadata/clinical data.
p <- do_Feature_boxplot(
df = cell_type_df,
feature = "compartment_score",
group.by = "compartment",
split.by = "treatment"
)
p + geom_hline(yintercept = params$predicted.score_thr) + coord_cartesian(ylim = c(0.25, 1))p <- do_Feature_boxplot(
df = cell_type_df,
feature = "compartment_score",
group.by = "compartment",
split.by = "subdiagnosis"
)
p + geom_hline(yintercept = params$predicted.score_thr) + coord_cartesian(ylim = c(0.25, 1))p <- do_Feature_boxplot(
df = cell_type_df,
feature = "compartment_score",
group.by = "compartment",
split.by = "disease_timing"
)
p + geom_hline(yintercept = params$predicted.score_thr) + coord_cartesian(ylim = c(0.25, 1))We do not really see a pattern linking the label transfer score and some meta/clinical data.
We finally look for each patient and compartment, the number of cells
that have a predicted.score > 0.5 (TRUE).
scores_df <- cell_type_df |>
# count how many per compartment
dplyr::count(sample_id, compartment, pass_mapping_QC) |>
# make the table wide and use a value of 0 if a count is missing
tidyr::pivot_wider(names_from = pass_mapping_QC, values_from = n, values_fill = 0)
DT::datatable(scores_df,
caption = "counts of cells that have a good/poor mapping in each compartment",
extensions = "Buttons",
options = list(
dom = "Bfrtip",
buttons = c("csv", "excel")
)
)Here we evaluate with marker genes the identification of endothelial, immune cells sub-populations. We expect that cells labeled as endothelial and immune cells will have higher expression of these markers compared to other cell types.
We look at the endothelial marker
"ENSG00000110799" = "VWF"
do_Feature_mean(
df = cell_type_df,
feature = "ENSG00000110799",
group.by = "compartment"
) +
ggtitle("VWF expression averaged by compartment for all cells")do_Feature_mean(
df = cell_type_df_pass,
feature = "ENSG00000110799",
group.by = "compartment"
) +
ggtitle("VWF expression averaged by compartment for cells passing the `predicted.score` threshold")do_Feature_mean(
df = cell_type_df_nopass,
feature = "ENSG00000110799",
group.by = "compartment"
) +
ggtitle("VWF expression averaged by compartment for cells that don't pass the `predicted.score` threshold")do_Feature_boxplot(
df = cell_type_df,
feature = "ENSG00000110799",
group.by = "compartment",
split.by = "sample_id"
) +
ggtitle("boxplot of VWF expression for all cells")We look at the immune marker "ENSG00000081237" = "PTPRC"
alias "CD45"
do_Feature_mean(
df = cell_type_df,
feature = "ENSG00000081237",
group.by = "compartment"
) +
ggtitle("PTPRC expression averaged by compartment for all cells")do_Feature_mean(
df = cell_type_df_pass,
feature = "ENSG00000081237",
group.by = "compartment"
) +
ggtitle("PTPRC expression averaged by compartment for cells passing the `predicted.score` threshold")do_Feature_mean(
df = cell_type_df_nopass,
feature = "ENSG00000081237",
group.by = "compartment"
) +
ggtitle("PTPRC expression averaged by compartment for cells that are not passing the predicted score threshold")do_Feature_boxplot(
df = cell_type_df,
feature = "ENSG00000081237",
group.by = "compartment",
split.by = "sample_id"
) +
ggtitle("boxplot of PTPRC expression for all cells")We look at the umap reduction per sample.
Point are colored per compartment:
red = endothelial
green = fetal nephron
cyan = immune
purple = stroma
Black cross show cell that do not pass the
predicted.score threshold.
ggplot(cell_type_df, aes(x = umap.umap_1, y = umap.umap_2, color = compartment), shape = 19, size = 2) +
geom_point() +
geom_point(data = cell_type_df_nopass, mapping = aes(x = umap.umap_1, y = umap.umap_2), shape = 4, color = "black", alpha = 0.3) +
facet_wrap(facets = ~sample_id)predicted.score thresholdPoint are colored per compartment. Here we only plotted cells that do
pass the predicted.score threshold.
ggplot(cell_type_df_pass, aes(x = umap.umap_1, y = umap.umap_2, color = compartment), shape = 19, size = 2) +
geom_point() +
facet_wrap(facets = ~sample_id)Looking at the predicted.organ label (from the human
fetal atlas, Cao et
al.), we can be confident about the label transfer.
The majority of fetal nephron cell (92%) has been predicted as
kidney. The fact that the other compartments (i.e. stroma,
endothelial and immune) do not really match to kidney cells (less than
one third predicted as kidney) shouldn’t be a concern and and shouldn’t
be interpreted as a poor label transfer or annotation.
It is indeed known that Wilms tumor stroma (sometimes) shows (unexpected) differentiation into cell types such as skeletal muscle cells, fat tissue, cartilage, bone and even glial cells [1-2]. For that reason, we are not surprised that most stroma cells are not predicted as kidney cells.
Regarding the immune compartment, it is not of a surprise that cancer cells, and/or treatment, modulate the immune microenvironment, via the attraction of immune cells that are not usually in the kidney and/or induction of a cancer-associated phenotype. For that reason, we are not surprised that immune cells can be resembling immune cells from other organs.
The next step of the analysis is to test whether copyKAT
can help us distinguishing cancer cells from non-malignant cell types.
copyKAT
is a computational tool using integrative Bayesian approaches to
identify genome-wide aneuploidy in single cells to separate tumor cells
from normal cells, using high-throughput scRNAseq data. The underlying
logic for calculating DNA copy number events from RNAseq data is that
gene expression levels of many adjacent genes can provide depth
information to infer genomic copy number in that region.
Of note, it is known that CopyKAT has difficulty in
predicting tumor and normal cells in the cases of pediatric and liquid
tumors that have a few CNAs, such as Wilms tumors. CopyKAT
provides two ways to bypass this:
by providing a vector of cell names of known normal cells from the same dataset
or by try to search for T cells.
For that reason, we will first try to run CopyKAT on 2-5
samples that have a majority of cells mapping to kidney and a decent
amount (>100 cells when possible) of normal cells (endothelial and
immune cells) and for which we are quite confident with the label
transfer:
sample SCPCS000194
sample SCPCS000179
sample SCPCS000184
sample SCPCS000205
sample SCPCS0000208
For the selected samples, we will run copyKAT with and
without providing a list of normal cells. That will be useful to us to
see how consistent results are. Also it will inform us about how we
might analyze samples that aren’t predicted to have that many (or
sometimes any at all!) endothelial or immune cells.
# record the versions of the packages used in this analysis and other environment information
sessionInfo()## R version 4.4.0 (2024-04-24)
## Platform: aarch64-apple-darwin20
## Running under: macOS 15.1
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRlapack.dylib; LAPACK version 3.12.0
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## time zone: America/New_York
## tzcode source: internal
##
## attached base packages:
## [1] stats graphics grDevices datasets utils methods base
##
## other attached packages:
## [1] DT_0.33 patchwork_1.2.0 lubridate_1.9.3 forcats_1.0.0
## [5] stringr_1.5.1 dplyr_1.1.4 purrr_1.0.2 readr_2.1.5
## [9] tidyr_1.3.1 tibble_3.2.1 ggplot2_3.5.1 tidyverse_2.0.0
## [13] Seurat_5.1.0 SeuratObject_5.0.2 sp_2.1-4
##
## loaded via a namespace (and not attached):
## [1] RColorBrewer_1.1-3 jsonlite_1.8.8 magrittr_2.0.3
## [4] spatstat.utils_3.1-0 farver_2.1.2 rmarkdown_2.28
## [7] vctrs_0.6.5 ROCR_1.0-11 spatstat.explore_3.3-2
## [10] htmltools_0.5.8.1 sass_0.4.9 sctransform_0.4.1
## [13] parallelly_1.38.0 KernSmooth_2.23-24 bslib_0.8.0
## [16] htmlwidgets_1.6.4 ica_1.0-3 plyr_1.8.9
## [19] plotly_4.10.4 zoo_1.8-12 cachem_1.1.0
## [22] igraph_2.0.3 mime_0.12 lifecycle_1.0.4
## [25] pkgconfig_2.0.3 Matrix_1.7-0 R6_2.5.1
## [28] fastmap_1.2.0 fitdistrplus_1.2-1 future_1.34.0
## [31] shiny_1.9.1 digest_0.6.37 colorspace_2.1-1
## [34] rprojroot_2.0.4 tensor_1.5 RSpectra_0.16-2
## [37] irlba_2.3.5.1 crosstalk_1.2.1 labeling_0.4.3
## [40] progressr_0.14.0 fansi_1.0.6 spatstat.sparse_3.1-0
## [43] timechange_0.3.0 httr_1.4.7 polyclip_1.10-7
## [46] abind_1.4-5 compiler_4.4.0 bit64_4.0.5
## [49] withr_3.0.1 fastDummies_1.7.4 highr_0.11
## [52] MASS_7.3-61 tools_4.4.0 lmtest_0.9-40
## [55] httpuv_1.6.15 future.apply_1.11.2 goftest_1.2-3
## [58] glue_1.7.0 nlme_3.1-166 promises_1.3.0
## [61] grid_4.4.0 Rtsne_0.17 cluster_2.1.6
## [64] reshape2_1.4.4 generics_0.1.3 gtable_0.3.5
## [67] spatstat.data_3.1-2 tzdb_0.4.0 data.table_1.16.0
## [70] hms_1.1.3 utf8_1.2.4 spatstat.geom_3.3-2
## [73] RcppAnnoy_0.0.22 ggrepel_0.9.5 RANN_2.6.2
## [76] pillar_1.9.0 vroom_1.6.5 spam_2.10-0
## [79] RcppHNSW_0.6.0 later_1.3.2 splines_4.4.0
## [82] lattice_0.22-6 bit_4.0.5 renv_1.0.7
## [85] survival_3.7-0 deldir_2.0-4 tidyselect_1.2.1
## [88] miniUI_0.1.1.1 pbapply_1.7-2 knitr_1.48
## [91] gridExtra_2.3 scattermore_1.2 xfun_0.47
## [94] matrixStats_1.3.0 stringi_1.8.4 lazyeval_0.2.2
## [97] yaml_2.3.10 evaluate_0.24.0 codetools_0.2-20
## [100] BiocManager_1.30.25 cli_3.6.3 uwot_0.2.2
## [103] xtable_1.8-4 reticulate_1.38.0 munsell_0.5.1
## [106] jquerylib_0.1.4 Rcpp_1.0.13 globals_0.16.3
## [109] spatstat.random_3.3-1 png_0.1-8 spatstat.univar_3.0-0
## [112] parallel_4.4.0 dotCall64_1.1-1 listenv_0.9.1
## [115] viridisLite_0.4.2 scales_1.3.0 ggridges_0.5.6
## [118] crayon_1.5.3 leiden_0.4.3.1 rlang_1.1.4
## [121] cowplot_1.1.3